### Jonathan D. Klingler, Gary E. Hollibaugh, Jr., and Adam J. Ramey
### "Don't Know What You Got: A Bayesian Hierarchical Model of Neuroticism and Ideological Uncertainty
### Political Science Research and Methods
###
###
### Posterior Predictive Checks
### Description: This file generates Figure 5.
###
### Note 1: Run the Preprocessing_and_Summary.R file beforehand to generate the CCES2014.Rda file.
### Note 2: Run the SingleCoreX.R files beforehand to generate the hierarchical model estimates.
### Note 3: Make sure to change the directory to the one containing this file.


rm(list=ls())
invisible(gc())
library(ggplot2)
library(reshape2)
library(Cairo)
library(coda)

load("CCES2014.Rda")


# need to remove datapoints where questions were not asked
# and where personality not elicited
cces <- cces[!is.na(cces$self_emoti) & !is.na(cces$self_consc) & !is.na(cces$self_agree) 
             & !is.na(cces$self_extra) & !is.na(cces$self_openn) & !is.na(cces$CC421a) 
             & !is.na(cces$newsint) 
             & !(cces$CC334C == "Not Asked") 
             & !(cces$CC334D == "Not Asked") 
             & !(cces$CC334E == "Not Asked") 
             & !(cces$CC334F == "Not Asked") 
             & !(cces$CC334G == "Not Asked") 
             & !(cces$CC334K == "Not Asked") 
             & !(cces$CC334L == "Not Asked") 
             & !(cces$CC334M == "Not Asked") 
             & !(cces$CC334W == "Not Asked"),]

attach(cces)

##the dependent variable
obama <- CC334C
clinton <- CC334D
cruz <- CC334E
paul <- CC334F
bush <- CC334G
dem <- CC334K
rep <- CC334L
teaparty <- CC334M 
scotus <- CC334W

y <- cbind(obama,cruz,clinton,paul,bush,dem,rep,teaparty,scotus)
for (i in 1:ncol(y)) y[,i] <- as.numeric(y[,i])

y[y>=8] <- 0
y[is.na(y)] <- 0
y <- y+1

##covariates for decisiveness and the saliency intercepts
self_neuro <- 8 - self_emoti
self_openn <- (self_openn - 1)/6
self_consc <- (self_consc - 1)/6
self_extra <- (self_extra - 1)/6
self_agree <- (self_agree - 1)/6
self_neuro <- (self_neuro - 1)/6
polinterest <- (newsint=="Most of the time")*1
unkinterest <- (newsint=="Don't know")*1
faminc_recode <- faminc
faminc_recode[which(faminc_recode == "$150,000 - $199,999")] <- levels(faminc_recode)[17]
faminc_recode[which(faminc_recode == "$200,000 - $249,999")] <- levels(faminc_recode)[17]
faminc_recode[which(faminc_recode == "$250,000 - $349,999")] <- levels(faminc_recode)[17]
faminc_recode[which(faminc_recode == "$350,000 - $499,999")] <- levels(faminc_recode)[17]
faminc_recode[which(faminc_recode == "$500,000 or more")] <- levels(faminc_recode)[17]
faminc_recode[which(faminc_recode == "$250,000 or more ")] <- levels(faminc_recode)[17]
income <- factor(faminc_recode)
income_notsay <- (faminc_recode == "Prefer not to say")*1
age <- 2014 - birthyr
age2 <- (age^2)/100
black <- (race == "Black")*1
hisp <- (race == "Hispanic")*1
other <- (race == "Asian" | race == "Native American" | race == "Mixed" | race == "Other" | race == "Middle Eastern")*1
fulltime <- (employ == "Full-time")*1
parttime <- (employ == "Part-time")*1
unemploy <- (employ == "Unemployed")*1
retired <- (employ == "Retired")*1
female <- I(gender == "Female")*1
education <- as.numeric(educ)

X <- cbind(1,
           self_openn,
           self_consc,
           self_extra,
           self_agree,
           self_neuro,
           female,
           age,
           age2,
           black,
           hisp,
           other,
           education,
           polinterest,
           unkinterest,
           income,
           income_notsay,
           fulltime,
           parttime,
           unemploy,
           retired)

##covariate for saliency regression (equal to 1 if respondent and stimulus are the same party)
Xs <- cbind((CC421a=="Democrat")*1, (CC421a=="Republican")*1, 
            (CC421a=="Democrat")*1, (CC421a=="Republican")*1,
            (CC421a=="Republican")*1, (CC421a=="Democrat")*1,
            (CC421a=="Republican")*1, (CC421a=="Republican")*1,
            (CC421a=="Republican")*1)

detach(cces)


### load hierarchical model data
load("DKWYGCore1.Rda") 
load("DKWYGCore2.Rda") 
load("DKWYGCore3.Rda") 
load("DKWYGCore4.Rda") 
load("DKWYGCore5.Rda") 
load("DKWYGCore6.Rda")
load("DKWYGCore7.Rda") 
load("DKWYGCore8.Rda") 



thinning <- 4 # 12,500 per chain; 100,000 overall
a <- seq(1, dim(dkwyg.1[[1]])[1])
b <- a[seq(1, length(a), thinning)]


# coerce into a single object
DKWYG <- mcmc.list(
                   as.mcmc(dkwyg.1[[1]][b,]),
                   as.mcmc(dkwyg.2[[1]][b,]),
                   as.mcmc(dkwyg.3[[1]][b,]),
                   as.mcmc(dkwyg.4[[1]][b,]),
                   as.mcmc(dkwyg.5[[1]][b,]),
                   as.mcmc(dkwyg.6[[1]][b,]),
                   as.mcmc(dkwyg.7[[1]][b,]),
                   as.mcmc(dkwyg.8[[1]][b,])
                   )


# coerce into a separate object for testing
DKWYG.nofixed <- mcmc.list(
                           as.mcmc(dkwyg.1[[1]][b,-c(86,87)]),
                           as.mcmc(dkwyg.2[[1]][b,-c(86,87)]),
                           as.mcmc(dkwyg.3[[1]][b,-c(86,87)]),
                           as.mcmc(dkwyg.4[[1]][b,-c(86,87)]),
                           as.mcmc(dkwyg.5[[1]][b,-c(86,87)]),
                           as.mcmc(dkwyg.6[[1]][b,-c(86,87)]),
                           as.mcmc(dkwyg.7[[1]][b,-c(86,87)]),
                           as.mcmc(dkwyg.8[[1]][b,-c(86,87)])
                           )

# coerce the chains into a single chain for further analysis
BZ_data <- as.data.frame(runjags::combine.mcmc(DKWYG))
candNames <- c("Obama","Cruz","Clinton","Paul","Bush","Democrats",
               "Republicans","Tea Party","Supreme Court")
colnames(BZ_data)[86:94] <- candNames


# functions to create HPDs
middle95HPD <- function(x){
    out <- HPDinterval(as.mcmc(x), prob = 0.95)[,]
    names(out) <- c("ymin","ymax")
    return(out)
}

# extract the estimated coefficients for each simulation
BZ_a <- BZ_data[,grep("beta_a",colnames(BZ_data))]
BZ_b <- BZ_data[,grep("beta_b",colnames(BZ_data))]
BZ_d <- BZ_data[,grep("beta_d",colnames(BZ_data))]
BZ_e <- BZ_data[,grep("beta_e",colnames(BZ_data))]
BZ_s <- BZ_data[,grep("beta_s",colnames(BZ_data))]
BZ_tau <- BZ_data[,grep("tau",colnames(BZ_data))]
BZ_tau <- data.frame("tau[0]" = -1.5, BZ_tau)
colnames(BZ_tau) <- c("tau[0]", "tau[1]", "tau[2]", "tau[3]", "tau[4]", "tau[5]")

# extract the estimated placements for each simulation
BZ_obama <- BZ_data[,grep("Obama",colnames(BZ_data))]
BZ_cruz <- BZ_data[,grep("Cruz",colnames(BZ_data))]
BZ_clinton <- BZ_data[,grep("Clinton",colnames(BZ_data))]
BZ_paul <- BZ_data[,grep("Paul",colnames(BZ_data))]
BZ_bush <- BZ_data[,grep("Bush",colnames(BZ_data))]
BZ_dem <- BZ_data[,grep("Democrats",colnames(BZ_data))]
BZ_rep <- BZ_data[,grep("Republicans",colnames(BZ_data))]
BZ_tea <- BZ_data[,grep("Tea Party",colnames(BZ_data))]
BZ_scotus <- BZ_data[,grep("Supreme Court",colnames(BZ_data))]

# generate predicted values on the latent scale
sal_int <- X%*%t(BZ_e)
dec_mean <- X%*%t(BZ_d) 
opi_int <- X%*%t(BZ_a)
opi_slo <- X%*%t(BZ_b)
sal_slo_obama <- Xs[,1]%*%t(BZ_s)
sal_slo_cruz <- Xs[,2]%*%t(BZ_s)
sal_slo_clinton <- Xs[,3]%*%t(BZ_s)
sal_slo_paul <- Xs[,4]%*%t(BZ_s)
sal_slo_bush <- Xs[,5]%*%t(BZ_s)
sal_slo_dem <- Xs[,6]%*%t(BZ_s)
sal_slo_rep <- Xs[,7]%*%t(BZ_s)
sal_slo_tea <- Xs[,8]%*%t(BZ_s)
sal_slo_scotus <- Xs[,9]%*%t(BZ_s)

# generate predicted placements on the latent scale
mu_t_obama <- opi_int + opi_slo*BZ_obama
mu_t_clinton <- opi_int + opi_slo*BZ_clinton
mu_t_cruz <- opi_int + opi_slo*BZ_cruz
mu_t_paul <- opi_int + opi_slo*BZ_paul
mu_t_bush <- opi_int + opi_slo*BZ_bush
mu_t_dem <- opi_int + opi_slo*BZ_dem
mu_t_rep <- opi_int + opi_slo*BZ_rep
mu_t_tea <- opi_int + opi_slo*BZ_tea
mu_t_scotus <- opi_int + opi_slo*BZ_scotus

# remove the raw opinion coefficients since we don't need them anymore 
# and they take up LOTS of memory!
# otherwise it's very likely you run out of memory
rm(opi_int)
rm(opi_slo)
rm(BZ_obama)
rm(BZ_clinton)
rm(BZ_cruz)
rm(BZ_paul)
rm(BZ_bush)
rm(BZ_dem)
rm(BZ_rep)
rm(BZ_tea)
rm(BZ_scotus)


# create predicted saliency on the latent scale
mu_s_obama <- sal_int + sal_slo_obama
mu_s_clinton <- sal_int + sal_slo_clinton
mu_s_cruz <- sal_int + sal_slo_cruz
mu_s_paul <- sal_int + sal_slo_paul
mu_s_bush <- sal_int + sal_slo_bush
mu_s_dem <- sal_int + sal_slo_dem
mu_s_rep <- sal_int + sal_slo_rep
mu_s_tea <- sal_int + sal_slo_tea
mu_s_scotus <- sal_int + sal_slo_scotus

# remove the raw saliency coefficients since we don't need them anymore 
rm(sal_int)
rm(sal_slo_obama)
rm(sal_slo_clinton)
rm(sal_slo_cruz)
rm(sal_slo_paul)
rm(sal_slo_bush)
rm(sal_slo_dem)
rm(sal_slo_rep)
rm(sal_slo_tea)
rm(sal_slo_scotus)

# create saliency probabilities for each simulation
prob_s_obama <- pnorm(-mu_s_obama)
prob_s_clinton <- pnorm(-mu_s_clinton)
prob_s_cruz <- pnorm(-mu_s_cruz)
prob_s_paul <- pnorm(-mu_s_paul)
prob_s_bush <- pnorm(-mu_s_bush)
prob_s_dem <- pnorm(-mu_s_dem)
prob_s_rep <- pnorm(-mu_s_rep)
prob_s_tea <- pnorm(-mu_s_tea)
prob_s_scotus <- pnorm(-mu_s_scotus)

# remove the saliency coefficients on the latent scale
# since we now have the saliency probabilities, which is 
# what we need
rm(mu_s_obama)
rm(mu_s_clinton)
rm(mu_s_cruz)
rm(mu_s_paul)
rm(mu_s_bush)
rm(mu_s_dem)
rm(mu_s_rep)
rm(mu_s_tea)
rm(mu_s_scotus)

# calculate decisiveness for each individual for each simulation
prob_decisive <- pnorm(-dec_mean)

# remove decisiveness on the latent scale, keeping the probabilities
rm(dec_mean)

# calculate the probability of believing obama to be in a particular category
prob1_obama <- pnorm(BZ_tau[,1] - mu_t_obama)
prob2_obama <- pnorm(BZ_tau[,2] - mu_t_obama) - pnorm(BZ_tau[,1] - mu_t_obama)
prob3_obama <- pnorm(BZ_tau[,3] - mu_t_obama) - pnorm(BZ_tau[,2] - mu_t_obama)
prob4_obama <- pnorm(BZ_tau[,4] - mu_t_obama) - pnorm(BZ_tau[,3] - mu_t_obama)
prob5_obama <- pnorm(BZ_tau[,5] - mu_t_obama) - pnorm(BZ_tau[,4] - mu_t_obama)
prob6_obama <- pnorm(BZ_tau[,6] - mu_t_obama) - pnorm(BZ_tau[,5] - mu_t_obama)
prob7_obama <- 1 - pnorm(BZ_tau[,6] - mu_t_obama)

# calculate the probability of RESPONDING that obama is in a given category
# this is what we care about
prob_NADK_obama <- prob_s_obama + (1-prob_s_obama)*prob_decisive*(prob3_obama + prob4_obama + prob5_obama)
prob_1_obama <- (1-prob_s_obama)*prob1_obama
prob_2_obama <- (1-prob_s_obama)*prob2_obama
prob_3_obama <- (1-prob_s_obama)*prob3_obama*(1-prob_decisive)
prob_4_obama <- (1-prob_s_obama)*prob4_obama*(1-prob_decisive)
prob_5_obama <- (1-prob_s_obama)*prob5_obama*(1-prob_decisive)
prob_6_obama <- (1-prob_s_obama)*prob6_obama
prob_7_obama <- (1-prob_s_obama)*prob7_obama

# remove the underlying belief probabilities
rm(prob1_obama)
rm(prob2_obama)
rm(prob3_obama)
rm(prob4_obama)
rm(prob5_obama)
rm(prob6_obama)
rm(prob7_obama)
rm(prob_s_obama)

# mean probability for each simulation  
# aggregating over individuals within simulation                        
obama_agg_means <- cbind(apply(prob_NADK_obama,2,mean),
                         apply(prob_1_obama,2,mean),
                         apply(prob_2_obama,2,mean),
                         apply(prob_3_obama,2,mean),
                         apply(prob_4_obama,2,mean),
                         apply(prob_5_obama,2,mean),
                         apply(prob_6_obama,2,mean),
                         apply(prob_7_obama,2,mean))

# remove the underlying respondent-simulation probabilities
# all 73500000 of them!
rm(prob_NADK_obama)
rm(prob_1_obama)
rm(prob_2_obama)
rm(prob_3_obama)
rm(prob_4_obama)
rm(prob_5_obama)
rm(prob_6_obama)
rm(prob_7_obama)

# garbage collection to free up more memory
invisible(gc())

    
# calculate the probability of believing clinton to be in a particular category    
prob1_clinton <- pnorm(BZ_tau[,1] - mu_t_clinton)
prob2_clinton <- pnorm(BZ_tau[,2] - mu_t_clinton) - pnorm(BZ_tau[,1] - mu_t_clinton)
prob3_clinton <- pnorm(BZ_tau[,3] - mu_t_clinton) - pnorm(BZ_tau[,2] - mu_t_clinton)
prob4_clinton <- pnorm(BZ_tau[,4] - mu_t_clinton) - pnorm(BZ_tau[,3] - mu_t_clinton)
prob5_clinton <- pnorm(BZ_tau[,5] - mu_t_clinton) - pnorm(BZ_tau[,4] - mu_t_clinton)
prob6_clinton <- pnorm(BZ_tau[,6] - mu_t_clinton) - pnorm(BZ_tau[,5] - mu_t_clinton)
prob7_clinton <- 1 - pnorm(BZ_tau[,6] - mu_t_clinton)

# calculate the probability of RESPONDING that clinton is in a given category
# this is what we care about
prob_NADK_clinton <- prob_s_clinton + (1-prob_s_clinton)*prob_decisive*(prob3_clinton + prob4_clinton + prob5_clinton)
prob_1_clinton <- (1-prob_s_clinton)*prob1_clinton
prob_2_clinton <- (1-prob_s_clinton)*prob2_clinton
prob_3_clinton <- (1-prob_s_clinton)*prob3_clinton*(1-prob_decisive)
prob_4_clinton <- (1-prob_s_clinton)*prob4_clinton*(1-prob_decisive)
prob_5_clinton <- (1-prob_s_clinton)*prob5_clinton*(1-prob_decisive)
prob_6_clinton <- (1-prob_s_clinton)*prob6_clinton
prob_7_clinton <- (1-prob_s_clinton)*prob7_clinton

# remove the underlying belief probabilities
rm(prob1_clinton)
rm(prob2_clinton)
rm(prob3_clinton)
rm(prob4_clinton)
rm(prob5_clinton)
rm(prob6_clinton)
rm(prob7_clinton)
rm(prob_s_clinton)
                         

# mean probability for each simulation  
# aggregating over individuals within simulation                        
clinton_agg_means <- cbind(apply(prob_NADK_clinton,2,mean),
                         apply(prob_1_clinton,2,mean),
                         apply(prob_2_clinton,2,mean),
                         apply(prob_3_clinton,2,mean),
                         apply(prob_4_clinton,2,mean),
                         apply(prob_5_clinton,2,mean),
                         apply(prob_6_clinton,2,mean),
                         apply(prob_7_clinton,2,mean))

# remove the underlying respondent-simulation probabilities
# all 73500000 of them!
rm(prob_NADK_clinton)
rm(prob_1_clinton)
rm(prob_2_clinton)
rm(prob_3_clinton)
rm(prob_4_clinton)
rm(prob_5_clinton)
rm(prob_6_clinton)
rm(prob_7_clinton)

# garbage collection to free up more memory
invisible(gc())


# calculate the probability of believing cruz to be in a particular category              
prob1_cruz <- pnorm(BZ_tau[,1] - mu_t_cruz)
prob2_cruz <- pnorm(BZ_tau[,2] - mu_t_cruz) - pnorm(BZ_tau[,1] - mu_t_cruz)
prob3_cruz <- pnorm(BZ_tau[,3] - mu_t_cruz) - pnorm(BZ_tau[,2] - mu_t_cruz)
prob4_cruz <- pnorm(BZ_tau[,4] - mu_t_cruz) - pnorm(BZ_tau[,3] - mu_t_cruz)
prob5_cruz <- pnorm(BZ_tau[,5] - mu_t_cruz) - pnorm(BZ_tau[,4] - mu_t_cruz)
prob6_cruz <- pnorm(BZ_tau[,6] - mu_t_cruz) - pnorm(BZ_tau[,5] - mu_t_cruz)
prob7_cruz <- 1 - pnorm(BZ_tau[,6] - mu_t_cruz)


# calculate the probability of RESPONDING that cruz is in a given category
# this is what we care about
prob_NADK_cruz <- prob_s_cruz + (1-prob_s_cruz)*prob_decisive*(prob3_cruz + prob4_cruz + prob5_cruz)
prob_1_cruz <- (1-prob_s_cruz)*prob1_cruz
prob_2_cruz <- (1-prob_s_cruz)*prob2_cruz
prob_3_cruz <- (1-prob_s_cruz)*prob3_cruz*(1-prob_decisive)
prob_4_cruz <- (1-prob_s_cruz)*prob4_cruz*(1-prob_decisive)
prob_5_cruz <- (1-prob_s_cruz)*prob5_cruz*(1-prob_decisive)
prob_6_cruz <- (1-prob_s_cruz)*prob6_cruz
prob_7_cruz <- (1-prob_s_cruz)*prob7_cruz

                          
# remove the underlying belief probabilities
rm(prob1_cruz)
rm(prob2_cruz)
rm(prob3_cruz)
rm(prob4_cruz)
rm(prob5_cruz)
rm(prob6_cruz)
rm(prob7_cruz)
rm(prob_s_cruz)
                         

# mean probability for each simulation  
# aggregating over individuals within simulation                        
cruz_agg_means <- cbind(apply(prob_NADK_cruz,2,mean),
                         apply(prob_1_cruz,2,mean),
                         apply(prob_2_cruz,2,mean),
                         apply(prob_3_cruz,2,mean),
                         apply(prob_4_cruz,2,mean),
                         apply(prob_5_cruz,2,mean),
                         apply(prob_6_cruz,2,mean),
                         apply(prob_7_cruz,2,mean))

# remove the underlying respondent-simulation probabilities
# all 73500000 of them!
rm(prob_NADK_cruz)
rm(prob_1_cruz)
rm(prob_2_cruz)
rm(prob_3_cruz)
rm(prob_4_cruz)
rm(prob_5_cruz)
rm(prob_6_cruz)
rm(prob_7_cruz)

# garbage collection to free up more memory
invisible(gc())
                         
            
# calculate the probability of believing paul to be in a particular category                  
prob1_paul <- pnorm(BZ_tau[,1] - mu_t_paul)
prob2_paul <- pnorm(BZ_tau[,2] - mu_t_paul) - pnorm(BZ_tau[,1] - mu_t_paul)
prob3_paul <- pnorm(BZ_tau[,3] - mu_t_paul) - pnorm(BZ_tau[,2] - mu_t_paul)
prob4_paul <- pnorm(BZ_tau[,4] - mu_t_paul) - pnorm(BZ_tau[,3] - mu_t_paul)
prob5_paul <- pnorm(BZ_tau[,5] - mu_t_paul) - pnorm(BZ_tau[,4] - mu_t_paul)
prob6_paul <- pnorm(BZ_tau[,6] - mu_t_paul) - pnorm(BZ_tau[,5] - mu_t_paul)
prob7_paul <- 1 - pnorm(BZ_tau[,6] - mu_t_paul)


# calculate the probability of RESPONDING that paul is in a given category
# this is what we care about
prob_NADK_paul <- prob_s_paul + (1-prob_s_paul)*prob_decisive*(prob3_paul + prob4_paul + prob5_paul)
prob_1_paul <- (1-prob_s_paul)*prob1_paul
prob_2_paul <- (1-prob_s_paul)*prob2_paul
prob_3_paul <- (1-prob_s_paul)*prob3_paul*(1-prob_decisive)
prob_4_paul <- (1-prob_s_paul)*prob4_paul*(1-prob_decisive)
prob_5_paul <- (1-prob_s_paul)*prob5_paul*(1-prob_decisive)
prob_6_paul <- (1-prob_s_paul)*prob6_paul
prob_7_paul <- (1-prob_s_paul)*prob7_paul

                
# remove the underlying belief probabilities                          
rm(prob1_paul)
rm(prob2_paul)
rm(prob3_paul)
rm(prob4_paul)
rm(prob5_paul)
rm(prob6_paul)
rm(prob7_paul)
rm(prob_s_paul)


# mean probability for each simulation  
# aggregating over individuals within simulation                        
paul_agg_means <- cbind(apply(prob_NADK_paul,2,mean),
                        apply(prob_1_paul,2,mean),
                        apply(prob_2_paul,2,mean),
                        apply(prob_3_paul,2,mean),
                        apply(prob_4_paul,2,mean),
                        apply(prob_5_paul,2,mean),
                        apply(prob_6_paul,2,mean),
                        apply(prob_7_paul,2,mean))

# remove the underlying respondent-simulation probabilities
# all 73500000 of them!
rm(prob_NADK_paul)
rm(prob_1_paul)
rm(prob_2_paul)
rm(prob_3_paul)
rm(prob_4_paul)
rm(prob_5_paul)
rm(prob_6_paul)
rm(prob_7_paul)

# garbage collection to free up more memory
invisible(gc())
       


# calculate the probability of believing bush to be in a particular category                      
prob1_bush <- pnorm(BZ_tau[,1] - mu_t_bush)
prob2_bush <- pnorm(BZ_tau[,2] - mu_t_bush) - pnorm(BZ_tau[,1] - mu_t_bush)
prob3_bush <- pnorm(BZ_tau[,3] - mu_t_bush) - pnorm(BZ_tau[,2] - mu_t_bush)
prob4_bush <- pnorm(BZ_tau[,4] - mu_t_bush) - pnorm(BZ_tau[,3] - mu_t_bush)
prob5_bush <- pnorm(BZ_tau[,5] - mu_t_bush) - pnorm(BZ_tau[,4] - mu_t_bush)
prob6_bush <- pnorm(BZ_tau[,6] - mu_t_bush) - pnorm(BZ_tau[,5] - mu_t_bush)
prob7_bush <- 1 - pnorm(BZ_tau[,6] - mu_t_bush)

# calculate the probability of RESPONDING that bush is in a given category
# this is what we care about
prob_NADK_bush <- prob_s_bush + (1-prob_s_bush)*prob_decisive*(prob3_bush + prob4_bush + prob5_bush)
prob_1_bush <- (1-prob_s_bush)*prob1_bush
prob_2_bush <- (1-prob_s_bush)*prob2_bush
prob_3_bush <- (1-prob_s_bush)*prob3_bush*(1-prob_decisive)
prob_4_bush <- (1-prob_s_bush)*prob4_bush*(1-prob_decisive)
prob_5_bush <- (1-prob_s_bush)*prob5_bush*(1-prob_decisive)
prob_6_bush <- (1-prob_s_bush)*prob6_bush
prob_7_bush <- (1-prob_s_bush)*prob7_bush

# remove the underlying belief probabilities                                                    
rm(prob1_bush)
rm(prob2_bush)
rm(prob3_bush)
rm(prob4_bush)
rm(prob5_bush)
rm(prob6_bush)
rm(prob7_bush)
rm(prob_s_bush)

# mean probability for each simulation  
# aggregating over individuals within simulation                        
bush_agg_means <- cbind(apply(prob_NADK_bush,2,mean),
                         apply(prob_1_bush,2,mean),
                         apply(prob_2_bush,2,mean),
                         apply(prob_3_bush,2,mean),
                         apply(prob_4_bush,2,mean),
                         apply(prob_5_bush,2,mean),
                         apply(prob_6_bush,2,mean),
                         apply(prob_7_bush,2,mean))

# remove the underlying respondent-simulation probabilities
# all 73500000 of them!
rm(prob_NADK_bush)
rm(prob_1_bush)
rm(prob_2_bush)
rm(prob_3_bush)
rm(prob_4_bush)
rm(prob_5_bush)
rm(prob_6_bush)
rm(prob_7_bush)

# garbage collection to free up more memory
invisible(gc())
       


# calculate the probability of believing the democratic party to be in a particular category                  
prob1_dem <- pnorm(BZ_tau[,1] - mu_t_dem)
prob2_dem <- pnorm(BZ_tau[,2] - mu_t_dem) - pnorm(BZ_tau[,1] - mu_t_dem)
prob3_dem <- pnorm(BZ_tau[,3] - mu_t_dem) - pnorm(BZ_tau[,2] - mu_t_dem)
prob4_dem <- pnorm(BZ_tau[,4] - mu_t_dem) - pnorm(BZ_tau[,3] - mu_t_dem)
prob5_dem <- pnorm(BZ_tau[,5] - mu_t_dem) - pnorm(BZ_tau[,4] - mu_t_dem)
prob6_dem <- pnorm(BZ_tau[,6] - mu_t_dem) - pnorm(BZ_tau[,5] - mu_t_dem)
prob7_dem <- 1 - pnorm(BZ_tau[,6] - mu_t_dem)


# calculate the probability of RESPONDING that the democratic party is in a given category
# this is what we care about
prob_NADK_dem <- prob_s_dem + (1-prob_s_dem)*prob_decisive*(prob3_dem + prob4_dem + prob5_dem)
prob_1_dem <- (1-prob_s_dem)*prob1_dem
prob_2_dem <- (1-prob_s_dem)*prob2_dem
prob_3_dem <- (1-prob_s_dem)*prob3_dem*(1-prob_decisive)
prob_4_dem <- (1-prob_s_dem)*prob4_dem*(1-prob_decisive)
prob_5_dem <- (1-prob_s_dem)*prob5_dem*(1-prob_decisive)
prob_6_dem <- (1-prob_s_dem)*prob6_dem
prob_7_dem <- (1-prob_s_dem)*prob7_dem

                          
# remove the underlying belief probabilities                                                    
rm(prob1_dem)
rm(prob2_dem)
rm(prob3_dem)
rm(prob4_dem)
rm(prob5_dem)
rm(prob6_dem)
rm(prob7_dem)
rm(prob_s_dem)

# mean probability for each simulation  
# aggregating over individuals within simulation                        
dem_agg_means <- cbind(apply(prob_NADK_dem,2,mean),
                         apply(prob_1_dem,2,mean),
                         apply(prob_2_dem,2,mean),
                         apply(prob_3_dem,2,mean),
                         apply(prob_4_dem,2,mean),
                         apply(prob_5_dem,2,mean),
                         apply(prob_6_dem,2,mean),
                         apply(prob_7_dem,2,mean))

# remove the underlying respondent-simulation probabilities
# all 73500000 of them!
rm(prob_NADK_dem)
rm(prob_1_dem)
rm(prob_2_dem)
rm(prob_3_dem)
rm(prob_4_dem)
rm(prob_5_dem)
rm(prob_6_dem)
rm(prob_7_dem)

# garbage collection to free up more memory
invisible(gc())


    
# calculate the probability of believing the republican party to be in a particular category                  
prob1_rep <- pnorm(BZ_tau[,1] - mu_t_rep)
prob2_rep <- pnorm(BZ_tau[,2] - mu_t_rep) - pnorm(BZ_tau[,1] - mu_t_rep)
prob3_rep <- pnorm(BZ_tau[,3] - mu_t_rep) - pnorm(BZ_tau[,2] - mu_t_rep)
prob4_rep <- pnorm(BZ_tau[,4] - mu_t_rep) - pnorm(BZ_tau[,3] - mu_t_rep)
prob5_rep <- pnorm(BZ_tau[,5] - mu_t_rep) - pnorm(BZ_tau[,4] - mu_t_rep)
prob6_rep <- pnorm(BZ_tau[,6] - mu_t_rep) - pnorm(BZ_tau[,5] - mu_t_rep)
prob7_rep <- 1 - pnorm(BZ_tau[,6] - mu_t_rep)


# calculate the probability of RESPONDING that the GOP is in a given category
# this is what we care about
prob_NADK_rep <- prob_s_rep + (1-prob_s_rep)*prob_decisive*(prob3_rep + prob4_rep + prob5_rep)
prob_1_rep <- (1-prob_s_rep)*prob1_rep
prob_2_rep <- (1-prob_s_rep)*prob2_rep
prob_3_rep <- (1-prob_s_rep)*prob3_rep*(1-prob_decisive)
prob_4_rep <- (1-prob_s_rep)*prob4_rep*(1-prob_decisive)
prob_5_rep <- (1-prob_s_rep)*prob5_rep*(1-prob_decisive)
prob_6_rep <- (1-prob_s_rep)*prob6_rep
prob_7_rep <- (1-prob_s_rep)*prob7_rep

                          
# remove the underlying belief probabilities                                                    
rm(prob1_rep)
rm(prob2_rep)
rm(prob3_rep)
rm(prob4_rep)
rm(prob5_rep)
rm(prob6_rep)
rm(prob7_rep)
rm(prob_s_rep)


# mean probability for each simulation  
# aggregating over individuals within simulation                        
rep_agg_means <- cbind(apply(prob_NADK_rep,2,mean),
                         apply(prob_1_rep,2,mean),
                         apply(prob_2_rep,2,mean),
                         apply(prob_3_rep,2,mean),
                         apply(prob_4_rep,2,mean),
                         apply(prob_5_rep,2,mean),
                         apply(prob_6_rep,2,mean),
                         apply(prob_7_rep,2,mean))

# remove the underlying respondent-simulation probabilities
# all 73500000 of them!
rm(prob_NADK_rep)
rm(prob_1_rep)
rm(prob_2_rep)
rm(prob_3_rep)
rm(prob_4_rep)
rm(prob_5_rep)
rm(prob_6_rep)
rm(prob_7_rep)

# garbage collection to free up more memory
invisible(gc())
       

    
# calculate the probability of believing the tea party to be in a particular category                  
prob1_tea <- pnorm(BZ_tau[,1] - mu_t_tea)
prob2_tea <- pnorm(BZ_tau[,2] - mu_t_tea) - pnorm(BZ_tau[,1] - mu_t_tea)
prob3_tea <- pnorm(BZ_tau[,3] - mu_t_tea) - pnorm(BZ_tau[,2] - mu_t_tea)
prob4_tea <- pnorm(BZ_tau[,4] - mu_t_tea) - pnorm(BZ_tau[,3] - mu_t_tea)
prob5_tea <- pnorm(BZ_tau[,5] - mu_t_tea) - pnorm(BZ_tau[,4] - mu_t_tea)
prob6_tea <- pnorm(BZ_tau[,6] - mu_t_tea) - pnorm(BZ_tau[,5] - mu_t_tea)
prob7_tea <- 1 - pnorm(BZ_tau[,6] - mu_t_tea)


# calculate the probability of RESPONDING that the tea party is in a given category
# this is what we care about
prob_NADK_tea <- prob_s_tea + (1-prob_s_tea)*prob_decisive*(prob3_tea + prob4_tea + prob5_tea)
prob_1_tea <- (1-prob_s_tea)*prob1_tea
prob_2_tea <- (1-prob_s_tea)*prob2_tea
prob_3_tea <- (1-prob_s_tea)*prob3_tea*(1-prob_decisive)
prob_4_tea <- (1-prob_s_tea)*prob4_tea*(1-prob_decisive)
prob_5_tea <- (1-prob_s_tea)*prob5_tea*(1-prob_decisive)
prob_6_tea <- (1-prob_s_tea)*prob6_tea
prob_7_tea <- (1-prob_s_tea)*prob7_tea

                          
# remove the underlying belief probabilities                                                    
rm(prob1_tea)
rm(prob2_tea)
rm(prob3_tea)
rm(prob4_tea)
rm(prob5_tea)
rm(prob6_tea)
rm(prob7_tea)
rm(prob_s_tea)


# mean probability for each simulation  
# aggregating over individuals within simulation                        
tea_agg_means <- cbind(apply(prob_NADK_tea,2,mean),
                         apply(prob_1_tea,2,mean),
                         apply(prob_2_tea,2,mean),
                         apply(prob_3_tea,2,mean),
                         apply(prob_4_tea,2,mean),
                         apply(prob_5_tea,2,mean),
                         apply(prob_6_tea,2,mean),
                         apply(prob_7_tea,2,mean))

# remove the underlying respondent-simulation probabilities
# all 73500000 of them!
rm(prob_NADK_tea)
rm(prob_1_tea)
rm(prob_2_tea)
rm(prob_3_tea)
rm(prob_4_tea)
rm(prob_5_tea)
rm(prob_6_tea)
rm(prob_7_tea)

# garbage collection to free up more memory
invisible(gc())
       

    
# calculate the probability of believing the supreme court to be in a particular category                  
prob1_scotus <- pnorm(BZ_tau[,1] - mu_t_scotus)
prob2_scotus <- pnorm(BZ_tau[,2] - mu_t_scotus) - pnorm(BZ_tau[,1] - mu_t_scotus)
prob3_scotus <- pnorm(BZ_tau[,3] - mu_t_scotus) - pnorm(BZ_tau[,2] - mu_t_scotus)
prob4_scotus <- pnorm(BZ_tau[,4] - mu_t_scotus) - pnorm(BZ_tau[,3] - mu_t_scotus)
prob5_scotus <- pnorm(BZ_tau[,5] - mu_t_scotus) - pnorm(BZ_tau[,4] - mu_t_scotus)
prob6_scotus <- pnorm(BZ_tau[,6] - mu_t_scotus) - pnorm(BZ_tau[,5] - mu_t_scotus)
prob7_scotus <- 1 - pnorm(BZ_tau[,6] - mu_t_scotus)


# calculate the probability of RESPONDING that the supreme court is in a given category
# this is what we care about
prob_NADK_scotus <- prob_s_scotus + (1-prob_s_scotus)*prob_decisive*(prob3_scotus + prob4_scotus + prob5_scotus)
prob_1_scotus <- (1-prob_s_scotus)*prob1_scotus
prob_2_scotus <- (1-prob_s_scotus)*prob2_scotus
prob_3_scotus <- (1-prob_s_scotus)*prob3_scotus*(1-prob_decisive)
prob_4_scotus <- (1-prob_s_scotus)*prob4_scotus*(1-prob_decisive)
prob_5_scotus <- (1-prob_s_scotus)*prob5_scotus*(1-prob_decisive)
prob_6_scotus <- (1-prob_s_scotus)*prob6_scotus
prob_7_scotus <- (1-prob_s_scotus)*prob7_scotus

                          
# remove the underlying belief probabilities                                                    
rm(prob1_scotus)
rm(prob2_scotus)
rm(prob3_scotus)
rm(prob4_scotus)
rm(prob5_scotus)
rm(prob6_scotus)
rm(prob7_scotus)
rm(prob_s_scotus)


# mean probability for each simulation  
# aggregating over individuals within simulation                        
scotus_agg_means <- cbind(apply(prob_NADK_scotus,2,mean),
                         apply(prob_1_scotus,2,mean),
                         apply(prob_2_scotus,2,mean),
                         apply(prob_3_scotus,2,mean),
                         apply(prob_4_scotus,2,mean),
                         apply(prob_5_scotus,2,mean),
                         apply(prob_6_scotus,2,mean),
                         apply(prob_7_scotus,2,mean))

# remove the underlying respondent-simulation probabilities
# all 73500000 of them!
rm(prob_NADK_scotus)
rm(prob_1_scotus)
rm(prob_2_scotus)
rm(prob_3_scotus)
rm(prob_4_scotus)
rm(prob_5_scotus)
rm(prob_6_scotus)
rm(prob_7_scotus)

# garbage collection to free up more memory
invisible(gc())
     
     
     
       

# get the response ranks per stimulus for each simulation    
obama_ranks <- t(apply(obama_agg_means,1,rank))    
clinton_ranks <- t(apply(clinton_agg_means,1,rank))    
cruz_ranks <- t(apply(cruz_agg_means,1,rank))        
paul_ranks <- t(apply(paul_agg_means,1,rank))        
bush_ranks <- t(apply(bush_agg_means,1,rank))        
dem_ranks <- t(apply(dem_agg_means,1,rank))        
rep_ranks <- t(apply(rep_agg_means,1,rank))        
tea_ranks <- t(apply(tea_agg_means,1,rank))        
scotus_ranks <- t(apply(scotus_agg_means,1,rank))        


# remove the mean probability matrices to free up memory
rm(obama_agg_means)
rm(clinton_agg_means)
rm(cruz_agg_means)
rm(paul_agg_means)
rm(bush_agg_means)
rm(dem_agg_means)
rm(rep_agg_means)
rm(tea_agg_means)
rm(scotus_agg_means)

# function to calculate the mode
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}


# calculate the actual response ranks for the empirical data, and the modal response ranks for the simulations
# first, for the pooled data
ranks.df <- data.frame(empirics = c(rank(data.frame(table(y[,"obama"])/length(y[,"obama"]))[,2], ties.method = "average"), 
                                    rank(data.frame(table(y[,"clinton"])/length(y[,"clinton"]))[,2], ties.method = "average"), 
                                    rank(data.frame(table(y[,"cruz"])/length(y[,"cruz"]))[,2], ties.method = "average"), 
                                    rank(data.frame(table(y[,"paul"])/length(y[,"paul"]))[,2], ties.method = "average"), 
                                    rank(data.frame(table(y[,"bush"])/length(y[,"bush"]))[,2], ties.method = "average"),
                                    rank(data.frame(table(y[,"dem"])/length(y[,"dem"]))[,2], ties.method = "average"),
                                    rank(data.frame(table(y[,"rep"])/length(y[,"rep"]))[,2], ties.method = "average"),
                                    rank(data.frame(table(y[,"teaparty"])/length(y[,"teaparty"]))[,2], ties.method = "average"),
                                    rank(data.frame(table(y[,"scotus"])/length(y[,"scotus"]))[,2], ties.method = "average")), 
                       estimates = c(apply(obama_ranks,2, Mode), apply(clinton_ranks,2, Mode), 
                                     apply(cruz_ranks,2, Mode), apply(paul_ranks,2, Mode),
                                     apply(bush_ranks,2, Mode), apply(dem_ranks,2, Mode),
                                     apply(rep_ranks,2, Mode), apply(tea_ranks,2, Mode),
                                     apply(scotus_ranks,2, Mode)))

# then the individual stimuli
obama.ranks.df <- data.frame(empirics = rank(data.frame(table(y[,"obama"])/length(y[,"obama"]))[,2], ties.method = "average"), 
                             estimates = apply(obama_ranks,2,Mode))

clinton.ranks.df <- data.frame(empirics = rank(data.frame(table(y[,"clinton"])/length(y[,"clinton"]))[,2], ties.method = "average"), 
                               estimates = apply(clinton_ranks,2,Mode))

cruz.ranks.df <- data.frame(empirics = rank(data.frame(table(y[,"cruz"])/length(y[,"cruz"]))[,2], ties.method = "average"), 
                            estimates = apply(cruz_ranks,2,Mode))

paul.ranks.df <- data.frame(empirics = rank(data.frame(table(y[,"paul"])/length(y[,"paul"]))[,2], ties.method = "average"), 
                            estimates = apply(paul_ranks,2,Mode))

bush.ranks.df <- data.frame(empirics = rank(data.frame(table(y[,"bush"])/length(y[,"bush"]))[,2], ties.method = "average"), 
                            estimates = apply(bush_ranks,2,Mode))

dem.ranks.df <- data.frame(empirics = rank(data.frame(table(y[,"dem"])/length(y[,"dem"]))[,2], ties.method = "average"), 
                           estimates = apply(dem_ranks,2,Mode))

rep.ranks.df <- data.frame(empirics = rank(data.frame(table(y[,"rep"])/length(y[,"rep"]))[,2], ties.method = "average"), 
                           estimates = apply(rep_ranks,2,Mode))

tea.ranks.df <- data.frame(empirics = rank(data.frame(table(y[,"teaparty"])/length(y[,"teaparty"]))[,2], ties.method = "average"), 
                           estimates = apply(tea_ranks,2,Mode))

scotus.ranks.df <- data.frame(empirics = rank(data.frame(table(y[,"scotus"])/length(y[,"scotus"]))[,2], ties.method = "average"), 
                              estimates = apply(scotus_ranks,2,Mode))
               

# combine everything into a massive data frame for plotting
big.ranks.frame <- rbind(cbind(ranks.df, "Stimuli" = "All Stimuli"),
                         cbind(obama.ranks.df, "Stimuli" = "Obama"),
                         cbind(cruz.ranks.df, "Stimuli" = "Cruz"),
                         cbind(clinton.ranks.df, "Stimuli" = "Clinton"),
                         cbind(paul.ranks.df, "Stimuli" = "Paul"),
                         cbind(bush.ranks.df, "Stimuli" = "Bush"),
                         cbind(dem.ranks.df, "Stimuli" = "Democratic Party"),
                         cbind(rep.ranks.df, "Stimuli" = "Republican Party"),
                         cbind(tea.ranks.df, "Stimuli" = "Tea Party"),
                         cbind(scotus.ranks.df, "Stimuli" = "Supreme Court"))                                                                                                                                    

# extract the pearson's correlations between empirics and estimates
ranks.cor <- c(round(sqrt(summary(lm(ranks.df))$r.squared), digits = 3),
               round(sqrt(summary(lm(obama.ranks.df))$r.squared), digits = 3),
               round(sqrt(summary(lm(cruz.ranks.df))$r.squared), digits = 3),
               round(sqrt(summary(lm(clinton.ranks.df))$r.squared), digits = 3),
               round(sqrt(summary(lm(paul.ranks.df))$r.squared), digits = 3),
               round(sqrt(summary(lm(bush.ranks.df))$r.squared), digits = 3),
               round(sqrt(summary(lm(dem.ranks.df))$r.squared), digits = 3),
               round(sqrt(summary(lm(rep.ranks.df))$r.squared), digits = 3),
               round(sqrt(summary(lm(tea.ranks.df))$r.squared), digits = 3),
               round(sqrt(summary(lm(scotus.ranks.df))$r.squared), digits = 3))

# extract the rank correlations between empirics and estimates
ranks.tau <- c(round(cor.test(ranks.df[,1], ranks.df[,2], method="kendall")$estimate,3),
               round(cor.test(obama.ranks.df[,1], obama.ranks.df[,2], method="kendall")$estimate,3),
               round(cor.test(cruz.ranks.df[,1], cruz.ranks.df[,2], method="kendall")$estimate,3),
               round(cor.test(clinton.ranks.df[,1], clinton.ranks.df[,2], method="kendall")$estimate,3),
               round(cor.test(paul.ranks.df[,1], paul.ranks.df[,2], method="kendall")$estimate,3),
               round(cor.test(bush.ranks.df[,1], bush.ranks.df[,2], method="kendall")$estimate,3),
               round(cor.test(dem.ranks.df[,1], dem.ranks.df[,2], method="kendall")$estimate,3),
               round(cor.test(rep.ranks.df[,1], rep.ranks.df[,2], method="kendall")$estimate,3),
               round(cor.test(tea.ranks.df[,1], tea.ranks.df[,2], method="kendall")$estimate,3),
               round(cor.test(scotus.ranks.df[,1], scotus.ranks.df[,2], method="kendall")$estimate,3))

# extract the p values for the rank correlations (not used)
ranks.tau.p <- c(round(cor.test(ranks.df[,1], ranks.df[,2], method="kendall")$p.value,3),
                round(cor.test(obama.ranks.df[,1], obama.ranks.df[,2], method="kendall")$p.value,3),
                round(cor.test(cruz.ranks.df[,1], cruz.ranks.df[,2], method="kendall")$p.value,3),
                round(cor.test(clinton.ranks.df[,1], clinton.ranks.df[,2], method="kendall")$p.value,3),
                round(cor.test(paul.ranks.df[,1], paul.ranks.df[,2], method="kendall")$p.value,3),
                round(cor.test(bush.ranks.df[,1], bush.ranks.df[,2], method="kendall")$p.value,3),
                round(cor.test(dem.ranks.df[,1], dem.ranks.df[,2], method="kendall")$p.value,3),
                round(cor.test(rep.ranks.df[,1], rep.ranks.df[,2], method="kendall")$p.value,3),
                round(cor.test(tea.ranks.df[,1], tea.ranks.df[,2], method="kendall")$p.value,3),
                round(cor.test(scotus.ranks.df[,1], scotus.ranks.df[,2], method="kendall")$p.value,3))

# put the correlations into their own data frames for plotting later
ranks.cor <- data.frame(Stimuli = levels(big.ranks.frame$Stimuli), Correlation = ranks.cor) 
ranks.tau.plot <- data.frame(Stimuli = levels(big.ranks.frame$Stimuli), Tau = ranks.tau, p = ranks.tau.p) 


# plot it all!                                         
big.ranks.tau <- ggplot(big.ranks.frame, aes(x= empirics,y=estimates)) + 
                    geom_point(alpha = 0.3) +
                    theme_bw() + 
                    stat_smooth(color = "black", size = 0.5, level = 0.95, method = "lm", aes(linetype = "Linear Fit"), se = FALSE) + 
                    stat_smooth(color = "black", size = 0.5, level = 0.95, method = "loess", aes(linetype = "LOESS Fit"), se = FALSE) + 
                    geom_abline(intercept = 0) + 
                    xlab('\nActual Rankings of Response Prevalence') + 
                    ylab('Predicted Rankings of Response Prevalence\n') + 
                    scale_x_continuous(breaks = c(1:8), limits = c(1,8)) + 
                    scale_y_continuous(breaks = c(1:8)) +
                    facet_wrap(~Stimuli, ncol = 5) + 
                    geom_text(data= ranks.tau.plot, aes(x=6, y=0.5, label = paste("list(tau == ","`",Tau,"`",")",sep=""), group = NULL), parse=TRUE) +
                    geom_text(data= ranks.cor, aes(x=6, y=1.25, label = paste("list(italic(r) == ","`", Correlation,"`",")",sep=""), group = NULL), parse=TRUE) +
                    scale_linetype_manual(name="Fit Method", values=c("Linear Fit"="dashed", "LOESS Fit"="dotted"))  
                

# save to an .eps file
cairo_ps("Figures/rank-plot-tau.eps", height = 5, width = 10)
big.ranks.tau
dev.off()                         